import re
import csv
import codecs
import string
import datetime
import os
from bs4 import BeautifulSoup

def dehtml(text):
    """Return the plain-text content of the HTML markup in *text*.

    The markup is parsed with BeautifulSoup and the result of its
    ``get_text()`` call is returned.
    """
    # NOTE(review): no parser is named here, so BeautifulSoup chooses one
    # itself; confirm whether a specific parser should be pinned.
    return BeautifulSoup(text).get_text()


# Substrings to strip from a text, mapped to what replaces them.
unnecessary = {'\n': ''}


def removeunnecessary(text):
    """Return *text* with every key of ``unnecessary`` replaced by its value.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string. If ``unnecessary`` is empty, *text* is returned
        unchanged.
    """
    # items() exists on both Python 2 and Python 3 dicts; iteritems() is
    # Python 2 only.
    for old, new in unnecessary.items():
        text = text.replace(old, new)
    # Return after the loop so that every mapping entry is applied (a return
    # inside the loop stops after the first entry and yields None when the
    # mapping is empty).
    return text

# Input folders and the full paths of the entries found in each of them.
# os.listdir returns bare entry names, so each one is joined onto its folder.

leg11folder = '/Users/bde254/Desktop/FranceAssemblyPlenary/11/cri/html/'
leg12folder = '/Users/bde254/Desktop/FranceAssemblyPlenary/12/'
leg13folder = '/Users/bde254/Desktop/FranceAssemblyPlenary/13/'

leg11files = [os.path.join(leg11folder, entry) for entry in os.listdir(leg11folder)]
leg12files = [os.path.join(leg12folder, entry) for entry in os.listdir(leg12folder)]
leg13files = [os.path.join(leg13folder, entry) for entry in os.listdir(leg13folder)]

# Every input path, in folder order 11, 12, 13.
allfiles = leg11files + leg12files + leg13files

# Read every input file and strip its HTML markup.
# `texts` collects one dict per file: {'name': <file name>, 'text': <plain text>}.

texts = []

for path in allfiles:
    name = os.path.basename(path)
    # Progress output. With a single argument this form prints the same
    # thing under the Python 2 print statement and the Python 3 function.
    print(name)
    # The files are decoded as latin-1; the `with` block closes each handle
    # as soon as it has been read instead of leaving it open.
    with codecs.open(path, "r", encoding="latin-1") as handle:
        htmltext = handle.read()
    texts.append({'name': name, 'text': dehtml(htmltext)})

# Count, per file, the occurrences of the phrase 'mets aux voix'
# (French, roughly "put to a vote"). The match is case-sensitive and literal.
votecounts = []
vote_phrase = re.compile('mets aux voix')
for text in texts:
    votecount = {
        'filename': text['name'],
        'votecount': len(vote_phrase.findall(text['text'])),
    }
    votecounts.append(votecount)

# Total number of phrase occurrences across all files (0 when there are none).
totalcount = sum(entry['votecount'] for entry in votecounts)




